knitr::opts_chunk$set(echo = TRUE)
library(jsonlite) # allows us to read in json files
library(tidyverse) # allows us to do lots of data manipulation and basic data science
library(here) # allows us to cut out long file paths (ex. "users/connor/dowloads/etc")
library(forcats) #
library(tidytext) # allows us to tokenize data
library(dplyr) # allows us to manipulate dataframes
library(stringr) # allows us to count the number of words in a cell
library(quanteda) # allows us to tokenize data
library(quanteda.textplots) # allows us to make network plots
library(gridExtra) # allows us to combine multiple plots into 1
library(wordcloud) # allows us to generate word clouds
library(fmsb)
library(plotly) #interactive ggplot graphs
library(ggthemes) # more themes for ggplot
library(tm) #for textmining corpus(), removePunctuation()
library(syuzhet) # for sentiment analysis, getNrc()
library(wordcloud2) # for comparison clouds
library(plotrix) # for pyramid plots
library(RColorBrewer) # for more color palettes
nature_analysis <- read_csv(here("data/training.csv"))
## Rows: 23436 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): text, claim
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Visual 1 Filter() to select Super-claims 1, 3, & 5 Select() text columns
na_1 <- nature_analysis %>%
filter(str_detect(claim, "1_"))
na_3 <- nature_analysis %>%
filter(str_detect(claim, "3_"))
na_5 <- nature_analysis %>%
filter(str_detect(claim, "5_"))
na_1_claims <- na_1 %>%
select(text)
na_3_claims <- na_3 %>%
select(text)
na_5_claims <- na_5 %>%
select(text)
Change classes using as.character() to create a vector Get sentiment scores using get_nrc_sentiment() Use data.frame() and colSums() to create a sentiment data frame
#Sentiment 1 Prep
na_1_claims_vector <- as.character(na_1_claims$text)
na_1_sentiment <- get_nrc_sentiment(na_1_claims_vector)
## Warning: `spread_()` was deprecated in tidyr 1.2.0.
## ℹ Please use `spread()` instead.
## ℹ The deprecated feature was likely used in the syuzhet package.
## Please report the issue to the authors.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
na_1_sentiment_score <- data.frame(colSums(na_1_sentiment[,]))
names(na_1_sentiment_score) <- 'score'
na_1_sentiment_score <- cbind("sentiment" = rownames(na_1_sentiment_score), na_1_sentiment_score)
#Sentiment 3 Prep
na_3_claims_vector <- as.character(na_3_claims$text)
na_3_sentiment <- get_nrc_sentiment(na_3_claims_vector)
na_3_sentiment_score <- data.frame(colSums(na_3_sentiment[,]))
names(na_3_sentiment_score) <- 'score'
na_3_sentiment_score <- cbind("sentiment"=rownames(na_3_sentiment_score), na_3_sentiment_score)
#Sentiment 5 Prep
na_5_claims_vector <- as.character(na_5_claims$text)
na_5_claims_sentiment <- get_nrc_sentiment(na_5_claims_vector)
na_5_claims_sentiment_score <- data.frame(colSums(na_5_claims_sentiment[,]))
names(na_5_claims_sentiment_score) <- 'score'
na_5_claims_sentiment_score <- cbind("sentiment" = rownames(na_5_claims_sentiment_score), na_5_claims_sentiment_score)
Use full_join() to combine the sentiment score data frames together into one data frame rename() the new columns
united_sent_score <- full_join(na_1_sentiment_score, na_3_sentiment_score, by = "sentiment") %>%
full_join(na_5_claims_sentiment_score, by = "sentiment") %>% rename(claim_1 = score.x, claim_3 = score.y, claim_5 = score) #%>%
#transmute(sentiment, score = score.x + score.y + score)
sd <- ggplot(united_sent_score, aes(x = sentiment)) +
geom_col(aes(y = claim_5, fill = "claim_5"), position = "stack") +
geom_col(aes(y = claim_1, fill = "claim_1"), position = "stack") +
geom_col(aes(y = claim_3, fill = "claim_3"), position = "stack") +
theme_minimal()+
scale_fill_manual(values = c("claim_5" = "#08589E", "claim_1" = "#4EB3D3", "claim_3" = "#A8DDB5")) +
theme(text = element_text(family = "Arial", size = 21), axis.text.y = element_text(family = "Arial", size = 21), axis.title = element_text(family = "Arial", size = 30), plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5))+
labs(x = "Sentiment", y = "Score", title = "Sentiment Scores", subtitle = "Super Claims 1, 3, & 5", fill = "Claim")
ggplotly(sd)
#Visual 2
na_5_claims <- na_5 %>%
select(text)
ngrams_5 <- na_5_claims %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngrams_5 <- ngrams_5 %>%
separate(bigram, c("word1", "word2"), sep = " ")
ngrams_5 <- ngrams_5 %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
ngrams_5 <- ngrams_5 %>%
unite(bigram, word1, word2, sep = " ")
ngrams_5 <- ngrams_5 %>%
count(bigram, sort = TRUE)
# Select the desired color palette
color_palette <- brewer.pal(9, "GnBu")
# Darken the colors by multiplying RGB values by 0.8
dark_color_palette <- sapply(color_palette, function(color) {
rgb <- col2rgb(color)
darker_rgb <- rgb * .8
rgb(darker_rgb[1], darker_rgb[2], darker_rgb[3], maxColorValue = 300)
})
# Use the modified color palette in the wordcloud function
wordcloud(ngrams_5$bigram, freq = ngrams_5$n, max.words = 200, min.freq = 5,
random.order = FALSE, colors = dark_color_palette, family = "Avenir")
#Visual 3
na_3_corpus <- corpus(na_3$text)
toks <- na_3_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
textplot_network(min_freq = 0.5, edge_color = "#7BCCC4")
Create bigram
na_3_claims <- na_3 %>%
select(text)
ngrams_3 <- na_3_claims %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
ngrams_3 <- ngrams_3 %>%
separate(bigram, c("word1", "word2"), sep = " ")
ngrams_3 <- ngrams_3 %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
ngrams_3 <- ngrams_3 %>%
unite(bigrams, word1, word2, sep = " ")
ngrams_3 <- ngrams_3 %>%
count(bigrams, sort = TRUE)
ngrams_3_corpus <- corpus(ngrams_3$bigrams)
toks <- na_3_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_tolower() %>%
tokens_remove(pattern = stopwords("english"), padding = FALSE)
fcmat <- fcm(toks, context = "window", tri = FALSE)
feat <- names(topfeatures(fcmat, 30))
fcm_select(fcmat, pattern = feat) %>%
textplot_network(min_freq = 0.5, edge_color = "#7BCCC4")
#Visual 4 Created a color vector
color <- c("#08589E", "#4EB3D3", "#A8DDB5")
pyramid.plot(na_1_sentiment_score$score, na_3_sentiment_score$score, labels = na_1_sentiment_score$sentiment, main = "Sentiment Comparison for Claims 1 & 3", gap = 250, top.labels = c("Super Claim 1: CC is Not Happening", "Sentiment", "Super Claim 3: CC is Not Bad"), show.values = TRUE, unit = "Score", ppmar = c(8,4,8,4), lxcol = brewer.pal(5, "YlGnBu"), rxcol = brewer.pal(5, "YlGnBu"))
## 3363 3363
## [1] 5.1 4.1 4.1 2.1